LIZA MARIE SORIANO
import sys
import pandas as pd
import numpy as np
import json
import altair as alt
from altair.expr import datum, if_
alt.renderers.enable('colab')
!pip install geopandas
import geopandas as gpd
# Color Theme
blue = "#578ceb"
pink = "#ff5aaa"
yellow = "#ffb14e"
mint = "#99d8c9"
purple = "#7854b3"
teal = "#069695"
dark_pink = "#be176f"

# For Altair: the order below is the order of the categorical color range.
# Alternative ordering that was tried:
# theme_range = [blue, pink, yellow, mint, purple, teal, dark_pink]
theme_range = [yellow, blue, pink, dark_pink, purple, teal, mint]

# Six-step light-to-dark blue ramp.
sequential_palette = [
    "#cfe8f3",
    "#a2d4ec",
    "#73bfe2",
    "#46abdb",
    "#1696d2",
    "#12719e",
]


# Adapted from https://towardsdatascience.com/consistently-beautiful-visualizations-with-altair-themes-c7f9f889602
def custom_theme():
    """Return the chart theme as a plain dict.

    The dict carries default chart ``width``/``height`` at the top level and,
    under ``config``, the title typography and color, the categorical and
    diverging color ranges, and the legend layout.

    NOTE(review): the call that registers/enables this theme is not visible
    in this chunk.

    Returns:
        dict: theme specification with keys ``width``, ``height``, ``config``.
    """
    # Typography
    font = "Georgia"
    labelFont = "Palatino"

    # Colors
    main_palette = theme_range

    return {
        # width and height are configured outside the config dict because
        # they are Chart properties, not chart-element configurations.
        "width": 685,  # from the guide
        "height": 380,  # not in the guide
        "config": {
            "title": {
                "fontSize": 18,
                "font": font,
                "anchor": "middle",
                # "color" is the title-config key for the text color; the
                # earlier "fontColor" key is not a title property, so the
                # teal was never applied through the theme.
                "color": teal,
            },
            "range": {
                "category": main_palette,
                # Fresh copy per call so callers cannot mutate the
                # module-level palette through the returned theme.
                "diverging": list(sequential_palette),
            },
            "legend": {
                "labelFont": labelFont,
                "labelFontSize": 12,
                "symbolSize": 100,  # default
                "titleFont": font,
                "titleFontSize": 12,
                "rowPadding": 4,
                "orient": "right",
                "offset": 15,
            },
        },
    }
cps_df.head()
# Why we need additional data to look at outcomes: report how many schools
# have no graduation rate. The missing values are counted with the vectorized
# `.isna().sum()` instead of looping over the Series with the builtin `sum`.
print(cps_df['Graduation_Rate_School'].isna().sum(), "schools out of", cps_df.shape[0], "are missing graduation rates.")
The year in CPS_5yrGrad refers to the cohort who were freshmen 5 years before that year; e.g., [Rate_Grad_5yr_2019] is the % of students who were 9th-graders in SY 2014-15 and graduated by SY 2018-19 (which theoretically would also include all students who graduated in SY 2017-18, i.e. after 4 years).
Let's grab cohorts that would normally graduate SY 2018-19 as well as the cohort right before. (Thus, freshmen during 2014-15 and 2015-16.)
# Read in data
# NOTE(review): the statement that loads grad_df is not visible in this chunk;
# only the preview below is.
grad_df.head()
# Check that new dataframe's shape makes sense
print("New cps_df:", cps_df.shape)
cps_df.head()
# Display the aggregates table (the data source of chart_mean5yr).
aggs_df
def _lowinc_scatter(race_field, race_title):
    """Scatter % low-income (x) against one race share (y) for cps_df.

    Points are colored and shaped by Rating_Status, and rows whose
    Rating_Status is 'NOT APPLICABLE' are filtered out.
    """
    x_channel = alt.X('%low_inc', axis=alt.Axis(format='%', title='% Low Income'))
    y_channel = alt.Y(race_field, axis=alt.Axis(format='%', title=race_title))
    points = alt.Chart(cps_df).mark_point().encode(
        x=x_channel,
        y=y_channel,
        color='Rating_Status',
        shape='Rating_Status',
    )
    return points.transform_filter(datum.Rating_Status != 'NOT APPLICABLE')


lowinc_v_white = _lowinc_scatter('%white', '% White')
lowinc_v_hisp = _lowinc_scatter('%hisp', '% Hispanic')
lowinc_v_black = _lowinc_scatter('%black', '% Black')

# One row with the three race panels on a shared y scale.
inc_by_race = alt.hconcat(
    lowinc_v_white, lowinc_v_hisp, lowinc_v_black
).resolve_scale(y='shared')

_inc_by_race_rating_title = {
    "text": ["Schools not in good standing tend to be poor, and",
             "majority white schools are probably not majority low-income"],
    "subtitle": ["Chicago Public Schools: % Low-Income vs. % Race by School Performance",
                 "[Source: City of Chicago Data Portal]"],
    "color": teal,
    "subtitleColor": purple,
}

# Start from an empty vertical stack that carries the title, then append one
# filtered copy of the three-panel row per rating status.
inc_by_race_rating = alt.vconcat().properties(title=_inc_by_race_rating_title)
for rating_status in ('GOOD STANDING', 'INTENSIVE SUPPORT', 'PROVISIONAL SUPPORT'):
    inc_by_race_rating &= inc_by_race.transform_filter(datum.Rating_Status == rating_status)
# % Low Income (x) vs. % Hispanic (y); % Black is binned (at most 5 bins) and
# drawn twice: as a five-color scale and as the point shape.
_hisp_black_title = {
    "text": ["Large clusters of very poor schools are either all Hispanic or all Black"],
    "subtitle": ["Chicago Public Schools Student Population:",
                 "% Low-Income vs. % Hispanic vs. %Black",
                 "[Source: City of Chicago Data Portal]"],
    "color": teal,
    "subtitleColor": purple
}
_pct_black_color = alt.Color(
    '%black:Q',
    title='% Black',
    bin=alt.Bin(maxbins=5),
    scale=alt.Scale(range=[mint, teal, purple, dark_pink, pink]),
)
_pct_black_shape = alt.Shape(
    '%black:O',
    title=None,
    bin=alt.Bin(maxbins=5),
    sort='descending',
)
lowinc_hisp_black = (
    alt.Chart(cps_df)
    .mark_point()
    .encode(
        x=alt.X('%low_inc', axis=alt.Axis(format='%', title='% Low Income')),
        y=alt.Y('%hisp', axis=alt.Axis(format='%', title='% Hispanic')),
        color=_pct_black_color,
        shape=_pct_black_shape,
    )
    .properties(title=_hisp_black_title)
    .configure_title(subtitleFontSize=12)
)
Each Chicago public school has information about student population demographics. There is also a performance metric ('Rating Status') by which each school's educational attainment is assessed. The following graphs indicate that underperforming schools tend to be very poor and most likely non-White.
Rating Status : Schools rated as needing intensive or provisional support are largely those with 80-100% low-income students and none of them have a sizeable White student population.
Race vs Income : Fifty percent or less of students in majority-White schools come from low-income families (only two exceptions). The low-income trend for Hispanic students is almost the complete opposite. Majority Hispanic schools are very likely to be majority low-income; the trend for Black student populations, though less stark, is similar.
inc_by_race_rating
Segregation?
Looking specifically at % Hispanic graphs, we can see two clusters of schools along the 100% low-income end: almost 0% or almost 100% Hispanic. In other words, there is a large number of completely poor schools that are either all Hispanic or zero Hispanic. You can see these clusters replicated in the % Black graphs.
I wondered if these clusters speak to the racial and socioeconomic segregation in the city of Chicago. Despite a number of mixed-race student populations, many poor Hispanic and Black students are probably relegated to schools that reflect the racial makeup of their neighborhoods. I therefore guessed that the cluster of poor schools that are ~0% Hispanic must be ~100% Black.
The following graphic provides a view that explores this idea.
lowinc_hisp_black
This graph provides another view of the all-or-nothing race composition in many of Chicago's schools. Looking at the largely poor schools (close to 100% low income), the cluster at the bottom (~0% Hispanic) seems to be mostly Black (light pink circles for >= 80% Black). Meanwhile, the cluster at the top (~100% Hispanic) consists of schools that are 20% or less Black (mint diamonds). This shows a pattern of segregation in Chicago public schools, while simultaneously demonstrating the poor economic status of Hispanic and Black student populations.
def _school_type_chart(race_field, race_title):
    """Plot one race share (x) against Is_High_School (y) for cps_df.

    Points are colored by Attendance_Boundaries and sized by
    Student_Count_Total. Every panel gets the same size-legend title;
    previously only type_hisp set it, which left the raw field name
    'Student_Count_Total' as the legend title for the other two panels.

    Args:
        race_field: cps_df column holding the race share, e.g. '%white'.
        race_title: x-axis title, e.g. '% White'.

    Returns:
        A 500x200 Altair chart without a title.
    """
    return alt.Chart(cps_df).mark_point().encode(
        x=alt.X(race_field, axis=alt.Axis(format='%', title=race_title)),
        y=alt.Y('Is_High_School:N', axis=alt.Axis(title='Is High School')),
        color='Attendance_Boundaries:N',
        size=alt.Size('Student_Count_Total:Q', title='Student Population'),
    ).properties(
        width=500,
        height=200,
    )


# Only the first panel carries the headline title.
type_white = _school_type_chart('%white', '% White').properties(
    title={
        "text": ["There are no majority-White High Schools"],
        "subtitle": ["Chicago Public Schools by School Type",
                     "[Source: City of Chicago Data Portal, CPS.edu School Data]"],
        "color": teal,
        "subtitleColor": purple
    }
)
type_black = _school_type_chart('%black', '% Black')
type_hisp = _school_type_chart('%hisp', '% Hispanic')
# Heatmap of school counts: binned 5-year graduation rate (x) by majority race (y).
_grad5yr_title = {
    "text": ["Majority Black schools are falling behind in graduation rates"],
    "subtitle": ["Chicago Public Schools 5-yr Graduation Rates by Majority Race",
                 "[Source: City of Chicago Data Portal, CPS.edu School Data]"],
    "color": teal,
    "subtitleColor": purple
}
# Keep high schools only and drop rows whose graduation rate equals 0.
# `== True` builds a chart filter expression here, not a Python boolean test.
_grad5yr_rows = (datum.Is_High_School == True) & (datum.Rate_Grad_5yr_2019 != 0)
_grad5yr_x = alt.X(
    'Rate_Grad_5yr_2019',
    bin=alt.Bin(maxbins=30),
    axis=alt.Axis(format='%', title='Graduation Rate'),
)
chart_grad5yr = (
    alt.Chart(cps_df)
    .mark_rect()
    .encode(x=_grad5yr_x, y='majority_race:N', color='count()')
    .transform_filter(_grad5yr_rows)
    .properties(width=500, height=200, title=_grad5yr_title)
    .configure_title(subtitleFontSize=12)
)
# Triangle markers: 5-year dropout rate (x) by majority race (y), colored by rating.
_drop5yr_title = {
    "text": ["Chicago Public Schools 5-yr Dropout Rates", "by Majority Race"],
    "subtitle": ["CPS seems to correctly identify high-dropout schools to be in need of intensive support.",
                 "Many of these schools are majority Black.",
                 "[Source: City of Chicago Data Portal, CPS.edu School Data]"],
    "color": teal,
    "subtitleColor": purple
}
# Keep high schools whose rating is anything other than 'NOT APPLICABLE'.
# `== True` builds a chart filter expression here, not a Python boolean test.
_drop5yr_rows = (datum.Is_High_School == True) & (datum.Rating_Status != 'NOT APPLICABLE')
_drop5yr_x = alt.X('Rate_Dropout_5yr_2019:Q', axis=alt.Axis(format='%', title='Dropout Rate'))
chart_drop5yr = (
    alt.Chart(cps_df)
    .mark_point(shape='triangle', size=50)
    .encode(x=_drop5yr_x, y='majority_race:N', color='Rating_Status:N')
    .transform_filter(_drop5yr_rows)
    .properties(width=500, height=200, title=_drop5yr_title)
    .configure_title(subtitleFontSize=12)
)
# Citywide mean outcome (x) per group (y), one facet row per rate_description.
_mean5yr_title = {
    "text": ["Chicago Public Schools 5-yr Mean Outcomes by Majority Race", "(Citywide Average Graduation and Dropout Rates)"],
    "subtitle": ["Majority Black high schools on average have a much higher dropout rate.",
                 "[Source: City of Chicago Data Portal, CPS.edu School Data]"],
    "color": teal,
    "subtitleColor": purple
}
# First filter: keep only the three 5-year high-school cohort groups.
_mean5yr_groups = alt.FieldOneOfPredicate(
    field='group',
    oneOf=['no_majority_HS_5yr_cohort',
           'majority_black_HS_5yr_cohort',
           'majority_hisp_HS_5yr_cohort'],
)
# Second filter: drop rows whose rate_description is 'other_rate'.
_mean5yr_not_other = {'not': alt.FieldOneOfPredicate(field='rate_description', oneOf=['other_rate'])}
chart_mean5yr = (
    alt.Chart(aggs_df)
    .mark_rect()
    .encode(
        y=alt.Y('group:N', axis=alt.Axis(title='', labels=False)),
        x=alt.X('city_mean:Q', axis=alt.Axis(format='%', title='Citywide Average')),
        row='rate_description:N',
        color='group:N',
    )
    .transform_filter(_mean5yr_groups)
    .transform_filter(_mean5yr_not_other)
    .properties(width=300, height=100, title=_mean5yr_title)
    .configure_title(subtitleFontSize=12)
)
Chicago's public high schools are much bigger than the elementary schools from which they draw. Many of them also have open boundaries, which means they accept students from any part of the city (Attendance_Boundaries = False). Such schools, however, tend to be highly competitive and often have selective enrollment.
Note that although there are a number of majority-White CPS elementary schools, there are only majority Black or majority Hispanic public high schools in Chicago. There are several ways this could be explained. Perhaps White students are transitioning into private schools, or perhaps they are getting absorbed into the larger competitive high schools that draw from all parts of the city (and indeed, the larger schools shown in the charts below are leaning towards higher percentages of White students). They could therefore be more likely to attend racially mixed schools with no one race majority.
type_white & type_black & type_hisp
Conversely, we can see in the charts above that neighborhood high schools (blue circles) are clustering towards opposite ends across each race spectrum, which again could be revealing the racial segregation of the city. If there were no segregation and races were evenly distributed across neighborhoods, we should expect to see blue high school circles around the 30% mark for each race, in proportion with Chicago's overall racial makeup. However, we see instead that for % Hispanic, there are more neighborhood schools >70% Hispanic.
Majority black high schools are lagging behind in terms of percent of students in the same freshman cohort graduating by their 5th year of high school.
chart_grad5yr
The dropout rates of Chicago's schools are generally too high for comfort, but it seems like CPS is correctly identifying schools with high dropout rates to be in need of more support. However, there seems to be a lower threshold for this rating status with majority Black schools compared to majority Hispanic schools. This might be worrisome for majority Hispanic schools that may need more attention and resources.
chart_drop5yr
Consistent with the previous two charts, we observe that the average majority black high school in Chicago has a much higher dropout rate and much lower graduation rate than other schools. Majority Hispanic schools are also at a disadvantage compared to no-race-majority high schools.
chart_mean5yr
# Explore: print a labelled summary of crime_df, one line per fact.
_crime_overview = (
    ('SHAPE:', crime_df.shape),
    ('COLUMNS:', crime_df.columns),
    ('DATATYPES:', crime_df.dtypes),
    ('UNIQUE VALUES - Offense Types:', crime_df['Primary Type'].nunique()),
    ('UNIQUE VALUES - Community Areas:', crime_df['Community Area'].nunique()),
)
for _label, _value in _crime_overview:
    print(_label, _value)
# Get a df counting crimes by Type within each Community Area
# NOTE(review): the statement that builds hood_crime_type is not visible in
# this chunk; only the preview below is.
# Preview
hood_crime_type.head()
# Get a df counting total crimes within each Community Area
# NOTE(review): the statement that builds hood_crimes is not visible in this
# chunk; only the display below is.
# Preview
hood_crimes
# Explore / Check Stats
# There are 2 crimes with NULL values for community area
_crime_counts = hood_crimes['crime_count']
print('SHAPE:', hood_crimes.shape)
print('TOTAL # OF CRIMES:', _crime_counts.sum())
# Missing community areas are counted on the raw crime_df, not on hood_crimes.
_missing_areas = crime_df['Community Area'].isna()
print('MISSING VALUES FOR COMM_AREA:', _missing_areas.sum())
print('HIGHEST # OF CRIMES IN ONE COMM_AREA:', _crime_counts.max())
Following steps adapted from: https://www.districtdatalabs.com/altair-choropleth-viz
# Load GeoJson of Community Area boundaries
# NOTE(review): the statement that loads comm_bounds is not visible in this
# chunk; only the preview below is.
comm_bounds.head()
# Merge crimes data with area boundaries data
# NOTE(review): the merge that produces crimes_gdf is not visible in this
# chunk; only the preview below is.
crimes_gdf.head()
Following steps adapted from: https://www.districtdatalabs.com/altair-choropleth-viz
# Convert GeoPandas df back to GeoJson
def gen_geojson(geodataframe):
    """Wrap a GeoPandas dataframe as inline data that Altair can map.

    The dataframe is serialised with ``to_json()`` and passed to Altair as
    inline values, with a JSON data format whose ``property`` is
    ``'features'``.

    Args:
        geodataframe: object exposing ``to_json()``
            (presumably a GeoDataFrame -- confirm against callers).

    Returns:
        alt.InlineData: the inline dataset for use as chart data.
    """
    serialised = geodataframe.to_json()
    features_format = alt.DataFormat(property='features', type='json')
    return alt.InlineData(values=serialised, format=features_format)
# Generate map
def gen_map(geodata, color_column, title, *, legend_title='Crime Count',
            width=400, height=400):
    """Generate a layered choropleth map with community-area labels.

    Three layers are stacked: a black-outlined base of the shapes, a
    choropleth colored by ``color_column``, and text labels placed at each
    shape's centroid.

    Args:
        geodata: Altair data whose records expose
            ``properties.centroid_lon``, ``properties.centroid_lat`` and
            ``properties.community_area`` (read by the label layer).
        color_column: field used for the quantitative color scale,
            e.g. ``'properties.crime_count'``.
        title: chart title, set on the base layer.
        legend_title: title of the color legend. Defaults to
            ``'Crime Count'``, the value that used to be hard-coded, so
            existing callers are unaffected.
        width: width given to the base and label layers (default 400).
        height: height given to the base and label layers (default 400).

    Returns:
        The layered Altair chart ``base + choro + labels``.
    """
    # Base layer: outlines only. The former empty `.encode()` call added no
    # encodings and has been dropped.
    base = alt.Chart(geodata, title=title).mark_geoshape(
        stroke='black',
        strokeWidth=1,
    ).properties(
        width=width,
        height=height,
    )

    # Choropleth layer: fill color driven by `color_column`.
    choro = alt.Chart(geodata).mark_geoshape(
        stroke='black',
    ).encode(
        color=alt.Color(color_column,
                        type='quantitative',
                        title=legend_title),
    )

    # Label layer: community-area identifier drawn at the stored centroid.
    labels = alt.Chart(geodata).mark_text(
        baseline='top',
    ).properties(
        width=width,
        height=height,
    ).encode(
        longitude='properties.centroid_lon:Q',
        latitude='properties.centroid_lat:Q',
        text='properties.community_area:O',
        size=alt.value(8),
        opacity=alt.value(1),
    )

    return base + choro + labels
# Convert crimes_gdf to Altair inline data, build the map colored by
# 'properties.crime_count' with the title '2018', then display it.
choro_data = gen_geojson(crimes_gdf)
crime_2018_map = gen_map(geodata=choro_data, color_column='properties.crime_count', title='2018')
crime_2018_map